This initial setup chunk does two things: it sets all code chunks to be displayed (echo = TRUE) in the final HTML output, and it loads the NBA dataset from a CSV file into a dataframe named mvp_data.

# Load necessary libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(lubridate)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

This section loads essential R packages: tidyverse (a collection of data manipulation and visualization packages), caret (for machine learning and predictive modeling), lubridate (for handling date/time data), zoo (for time series analysis), ggplot2 (for creating static visualizations), and plotly (for interactive visualizations).

# Impute missing values: every NA in a numeric column is replaced by that
# column's median (computed over the non-missing entries).
mvp_data <- mvp_data %>%
  mutate(across(
    where(is.numeric),
    function(col) replace(col, is.na(col), median(col, na.rm = TRUE))
  ))

This replaces missing values in all numeric columns with the median value of that column.

# 3. Create an MVP winner indicator
# Typically, the player with the highest award_share wins MVP.
# na.rm = TRUE guards against a stray NA making max() return NA for the
# whole season (which would set is_mvp to NA for every player that year).
# NOTE: an exact tie in award_share would flag more than one player.
mvp_data <- mvp_data %>%
  group_by(season) %>%
  mutate(is_mvp = award_share == max(award_share, na.rm = TRUE)) %>%
  ungroup()

This creates a new binary column is_mvp that identifies the player who won the MVP award in each season (the player with the highest award_share).

# 4. Feature engineering
# Derive composite performance metrics and position indicator variables.
mvp_data <- mvp_data %>%
  mutate(
    # Combined per-game production: points + rebounds + assists
    pra = pts_per_g + trb_per_g + ast_per_g,

    # Shooting efficiency: true-shooting plus effective FG pct, each on a 0-100 scale
    efficiency = ts_pct * 100 + efg_pct * 100,

    # Two-way impact: offensive plus defensive box plus/minus
    two_way_score = obpm + dbpm,

    # Value composite: VORP (weighted x5) plus win shares
    value_composite = vorp * 5 + ws,

    # Team success: win percentage rescaled to 0-100
    team_success = win_loss_pct * 100,

    # Share of a standard 82-game schedule played (approximation; shortened
    # seasons such as lockout years are not adjusted for)
    games_played_pct = g / 82,

    # Position indicators as 0/1 numerics
    is_guard = as.numeric(pos %in% c("PG", "SG", "G")),
    is_forward = as.numeric(pos %in% c("SF", "PF", "F")),
    is_center = as.numeric(pos %in% c("C"))
  )

This creates several new features to better capture player performance:

pra: combined points, rebounds, and assists per game. efficiency: combined true shooting and effective field goal percentages. two_way_score: combined offensive and defensive box plus/minus. value_composite: weighted combination of VORP and Win Shares. team_success: team's win percentage (scaled to 0-100). games_played_pct: percentage of games played in a season. Position indicators: binary variables for player positions.

# 5. Normalize features to avoid scale issues
# NOTE(review): preProcess is applied to EVERY numeric column, which
# includes the prediction target (award_share) and the season identifier.
# All downstream values are therefore in standardized units: the train/test
# split threshold of 1.22 is a scaled season, and the reported RMSE values
# are in standard-deviation units of award_share rather than raw shares.
# Consider excluding award_share and season from the scaling step.
preproc_params <- preProcess(mvp_data %>% select(where(is.numeric)), method = c("center", "scale"))
mvp_data_scaled <- predict(preproc_params, mvp_data)

This standardizes all numeric features (centering around mean 0 with standard deviation 1) to ensure all variables are on the same scale for modeling.

# 6. Create time-based features to capture trends
# lag() and first() operate on row order, which is not guaranteed to be
# chronological here, so the previous-season value is taken explicitly in
# season order via order_by. Row order of the data frame is left unchanged.
# The default for the first observed season is the player's own first value,
# making both trends 0 for a player's earliest season in the data.
mvp_data_scaled <- mvp_data_scaled %>%
  group_by(player) %>%
  mutate(
    pts_trend = pts_per_g -
      lag(pts_per_g, default = first(pts_per_g, order_by = season), order_by = season),
    ws_trend = ws -
      lag(ws, default = first(ws, order_by = season), order_by = season)
  ) %>%
  ungroup()

This adds two trend features that capture how a player’s performance changed from the previous season:

pts_trend: change in points per game from the previous season. ws_trend: change in win shares from the previous season.

# 7. Split into training and testing datasets
# Let's use data up to 2018 for training and 2019-2022 for testing
# NOTE(review): season was standardized by the preProcess step above, so
# 1.22 is the SCALED value of the 2018/2019 boundary, not a raw year. If
# the dataset gains seasons or the scaling step changes, this threshold
# silently shifts — prefer splitting on the raw season before scaling.
train_data <- mvp_data_scaled %>% filter(season <= 1.22)
test_data <- mvp_data_scaled %>% filter(season > 1.22)

This splits the data into training and testing sets. Because the season column was standardized in the earlier preprocessing step, the threshold 1.22 is the scaled value corresponding to the 2018/2019 boundary — training uses seasons up to 2018 and testing uses 2019 onward.

library(glmnet)      # For LASSO and Ridge regression
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(xgboost)     # For XGBoost
## Warning: package 'xgboost' was built under R version 4.3.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
## 
##     slice
## The following object is masked from 'package:dplyr':
## 
##     slice
library(e1071)       # For Support Vector Machines
## Warning: package 'e1071' was built under R version 4.3.3
library(keras)       # For neural networks
## Warning: package 'keras' was built under R version 4.3.2
library(rpart)       # For decision trees
## Warning: package 'rpart' was built under R version 4.3.3
library(kernlab)     # For Gaussian Process
## Warning: package 'kernlab' was built under R version 4.3.3
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha

This loads additional packages needed for various machine learning algorithms.

# Correlation-based feature screening against the MVP award share.
candidate_cols <- c("award_share", "pts_per_g", "trb_per_g", "ast_per_g",
                    "stl_per_g", "blk_per_g", "ts_pct", "ws", "vorp", "bpm",
                    "win_loss_pct", "pra", "efficiency", "two_way_score",
                    "value_composite", "team_success", "games_played_pct")
correlation_matrix <- cor(train_data %>% select(all_of(candidate_cols)))

# Pull each candidate's correlation with award_share (dropping its
# self-correlation) and rank by absolute strength.
award_share_correlations <-
  correlation_matrix["award_share", colnames(correlation_matrix) != "award_share"]
correlation_df <- data.frame(
  Variable = names(award_share_correlations),
  Correlation = abs(award_share_correlations)
)
correlation_df <- correlation_df[order(-correlation_df$Correlation), ]

# Keep the ten features most correlated with award_share
top_features <- head(correlation_df$Variable, 10)

This performs feature selection by:

Calculating correlations between key variables and award_share; creating a dataframe of these correlations; sorting to find the variables most strongly correlated with MVP award shares; and selecting the top 10 most correlated features.

# Build the modeling formula "award_share ~ f1 + f2 + ..." from the
# selected top features.
formula_str <- sprintf("award_share ~ %s", paste(top_features, collapse = " + "))
formula_obj <- as.formula(formula_str)

This creates an R formula object using the top 10 features for model building.

# Shared resampling scheme for all models: 5-fold cross-validation,
# with per-iteration logging suppressed.
train_control <- trainControl(method = "cv", number = 5, verboseIter = FALSE)

This sets up 5-fold cross-validation for model training.

# Matrix versions of the data for algorithms (e.g. glmnet) that take an
# x/y interface rather than a formula. model.matrix() expands the formula
# into a design matrix; column 1 is the intercept, which those fitters add
# themselves, so it is dropped.
x_train <- model.matrix(formula_obj, data = train_data)[, -1]
y_train <- train_data$award_share

x_test <- model.matrix(formula_obj, data = test_data)[, -1]
y_test <- test_data$award_share

This creates matrix versions of the training and testing data, which are required for some algorithms like LASSO and Ridge regression.

# 1. Linear Regression — the ordinary least squares baseline.
lm_model <- train(formula_obj, data = train_data, method = "lm",
                  trControl = train_control)

# 2. LASSO Regression (L1 regularization)
# Linear regression with an L1 penalty; it can perform implicit feature
# selection by shrinking some coefficients to exactly zero.
lasso_grid <- data.frame(alpha = 1,  # alpha = 1 selects the LASSO penalty
                         lambda = seq(0.001, 0.1, by = 0.001))
lasso_model <- train(x = x_train, y = y_train, method = "glmnet",
                     trControl = train_control, tuneGrid = lasso_grid)

# 3. Ridge Regression (L2 regularization)
# Linear regression with an L2 penalty; shrinks coefficients toward zero
# to reduce overfitting without eliminating any of them.
ridge_grid <- data.frame(alpha = 0,  # alpha = 0 selects the Ridge penalty
                         lambda = seq(0.001, 0.1, by = 0.001))
ridge_model <- train(x = x_train, y = y_train, method = "glmnet",
                     trControl = train_control, tuneGrid = ridge_grid)

# 4. Elastic Net (combination of L1 and L2)
# Blends LASSO and Ridge penalties; alpha controls the mix (0 = pure
# Ridge, 1 = pure LASSO) and lambda the overall penalty strength.
elastic_grid <- expand.grid(
  alpha = seq(0, 1, by = 0.2),
  lambda = seq(0.001, 0.1, by = 0.005)
)
elastic_net_model <- train(x = x_train, y = y_train, method = "glmnet",
                           trControl = train_control, tuneGrid = elastic_grid)

# 5. Gradient Boosting Machine (still keeping this one)
# Fits an ensemble of shallow decision trees via gradient boosting, using
# caret's default gbm tuning grid.
gbm_model <- train(formula_obj, data = train_data, method = "gbm",
                   trControl = train_control, verbose = FALSE)

# 6. Support Vector Regression with a radial (RBF) kernel;
# tuneLength = 5 lets caret try five candidate hyperparameter settings.
svr_model <- train(formula_obj, data = train_data, method = "svmRadial",
                   trControl = train_control, tuneLength = 5)

# 7. K-Nearest Neighbors, tuning the neighborhood size k over 1..20.
knn_model <- train(formula_obj, data = train_data, method = "knn",
                   trControl = train_control,
                   tuneGrid = data.frame(k = 1:20))

# Store models in a list
models <- list(
  LinearRegression = lm_model,
  LASSO = lasso_model,
  Ridge = ridge_model,
  ElasticNet = elastic_net_model,
  GradientBoosting = gbm_model,
  SupportVectorRegression = svr_model,
  KNN = knn_model
)

# Make predictions with each model on the test set.
# The glmnet-based models were fit through the x/y matrix interface, so
# they predict from x_test; the rest were fit with the formula interface
# and take the test data frame directly.
matrix_fit_models <- c("LASSO", "Ridge", "ElasticNet")
predictions <- lapply(
  setNames(names(models), names(models)),
  function(model_name) {
    if (model_name %in% matrix_fit_models) {
      predict(models[[model_name]], newdata = x_test)
    } else {
      predict(models[[model_name]], newdata = test_data)
    }
  }
)

This generates predictions for each model on the test data, handling the different input formats required by different models.

# Root-mean-square error between predicted and actual numeric vectors.
# Both arguments must have the same length; NA in either propagates to NA.
calculate_rmse <- function(pred, actual) {
  residuals <- pred - actual
  sqrt(mean(residuals^2))
}

# Evaluate every model on the held-out test set.
# vapply() instead of sapply(): the result is guaranteed to be a named
# numeric vector (one RMSE per model) regardless of the input list's shape.
model_rmse <- vapply(predictions, calculate_rmse, numeric(1), actual = y_test)
print(model_rmse)
##        LinearRegression                   LASSO                   Ridge 
##               0.7921366               0.7925482               0.8000463 
##              ElasticNet        GradientBoosting SupportVectorRegression 
##               0.7925482               0.4952495               0.6320104 
##                     KNN 
##               0.4938155

This calculates the Root Mean Square Error (RMSE) for each model and prints the results.

# Pick the winning model: the one with the smallest test RMSE.
best_model_name <- names(model_rmse)[which.min(model_rmse)]
best_model <- models[[best_model_name]]
cat("Best model:", best_model_name, "with RMSE:", min(model_rmse), "\n")
## Best model: KNN with RMSE: 0.4938155

This identifies the model with the lowest RMSE (best performance).

# Make predictions with the best model.
# The glmnet-based models were fit on the matrix interface, so they
# predict from x_test; all others take the test data frame.
if (best_model_name %in% c("LASSO", "Ridge", "ElasticNet")) {
  test_data$predicted_award_share <- predict(best_model, newdata = x_test)
} else {
  test_data$predicted_award_share <- predict(best_model, newdata = test_data)
}

# For each season, predict the MVP (highest predicted award_share).
# with_ties = FALSE guarantees exactly one predicted MVP per season even
# if two players tie on the top predicted share; ungroup() so the result
# is a plain (ungrouped) data frame.
predicted_mvps <- test_data %>%
  group_by(season) %>%
  slice_max(order_by = predicted_award_share, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(season, player, predicted_award_share, award_share, is_mvp)

This identifies the predicted MVP for each season (player with highest predicted award_share).

# Calculate accuracy: fraction of test seasons where the predicted MVP
# matches the actual MVP (the player with the highest real award_share).
# with_ties = FALSE keeps exactly one actual MVP per season, so the
# denominator (rows of the merged comparison) equals the number of seasons.
actual_mvps <- test_data %>%
  group_by(season) %>%
  slice_max(order_by = award_share, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(season, player, award_share)

comparison <- merge(predicted_mvps, actual_mvps, by = "season", suffixes = c("_predicted", "_actual"))
correct_predictions <- sum(comparison$player_predicted == comparison$player_actual)
total_seasons <- nrow(comparison)
mvp_accuracy <- correct_predictions / total_seasons
cat("MVP Prediction Accuracy:", mvp_accuracy * 100, "%\n")
## MVP Prediction Accuracy: 50 %

This calculates the percentage of seasons where the model correctly predicted the MVP winner.

# Extract and analyze feature importance for the best model.
# Each model family stores its notion of "importance" differently, so the
# extraction is dispatched on the winning model's name; the result is a
# data frame with a Variable column and an Importance column, sorted
# descending by Importance.
if (best_model_name == "LinearRegression") {
  # For linear regression: importance = |fitted coefficient|.
  coefficients <- coef(best_model$finalModel)[-1]  # Remove intercept
  var_importance <- data.frame(
    Variable = names(coefficients),
    Importance = abs(coefficients)
  )
  var_importance <- var_importance[order(var_importance$Importance, decreasing = TRUE), ]
} else if (best_model_name %in% c("LASSO", "Ridge", "ElasticNet")) {
  # For regularized regression: coefficients taken at the cross-validated
  # best lambda (second argument of coef() is the penalty value s).
  coefficients <- coef(best_model$finalModel, best_model$bestTune$lambda)[, 1][-1]  # Remove intercept
  var_importance <- data.frame(
    Variable = names(coefficients),
    Importance = abs(coefficients)
  )
  var_importance <- var_importance[order(var_importance$Importance, decreasing = TRUE), ]
} else if (best_model_name == "GradientBoosting") {
  # For GBM: summary() on a gbm fit returns relative influence (rel.inf)
  # per variable; rename that column to the common Importance name.
  var_importance <- summary(best_model$finalModel, plotit = FALSE)
  var_importance <- var_importance[order(var_importance$rel.inf, decreasing = TRUE), ]
  names(var_importance)[names(var_importance) == "rel.inf"] <- "Importance"
} else {
  # For other models where we may not have direct feature importance
  # (e.g. KNN, SVR): fall back to |correlation with award_share| computed
  # earlier — a model-agnostic proxy, not a property of the fitted model.
   var_importance <- correlation_df
  names(var_importance)[names(var_importance) == "Correlation"] <- "Importance"
}

print(var_importance)
##                          Variable Importance
## vorp                         vorp 0.46913580
## value_composite   value_composite 0.45290418
## ws                             ws 0.38803353
## pra                           pra 0.29506484
## pts_per_g               pts_per_g 0.29489024
## ast_per_g               ast_per_g 0.20408176
## trb_per_g               trb_per_g 0.19426497
## bpm                           bpm 0.19305511
## two_way_score       two_way_score 0.19301911
## stl_per_g               stl_per_g 0.18957140
## blk_per_g               blk_per_g 0.15308848
## team_success         team_success 0.13730372
## win_loss_pct         win_loss_pct 0.13730372
## games_played_pct games_played_pct 0.09092698
## ts_pct                     ts_pct 0.08804256
## efficiency             efficiency 0.07878740

This extracts the feature importance from the best model, using different methods depending on the model type.

library(ggplot2)
library(plotly)

# Compare model performance: one row per model with its test RMSE,
# ordered best (lowest) first.
model_performance <- data.frame(Model = names(model_rmse), RMSE = model_rmse)
model_performance <- model_performance[order(model_performance$RMSE), ]

# Bar chart of RMSE per model (geom_col() is geom_bar(stat = "identity")),
# bars ordered so the best model appears last on the x axis.
p1 <- ggplot(model_performance, aes(x = reorder(Model, -RMSE), y = RMSE)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(title = "Model Performance Comparison", x = "Model", y = "RMSE (lower is better)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p1)
# Horizontal bar chart of the ten most important features for the winner.
p2 <- head(var_importance, 10) %>%
  ggplot(aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = paste("Feature Importance -", best_model_name), x = "", y = "Importance")

ggplotly(p2)
# Actual vs. predicted award shares on the test set; the dashed red
# diagonal marks perfect prediction (y = x).
results <- data.frame(
  Player = test_data$player,
  Season = test_data$season,
  Actual = test_data$award_share,
  Predicted = test_data$predicted_award_share
)

p3 <- ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  geom_point(alpha = 0.5) +
  labs(title = "Actual vs Predicted MVP Award Shares",
       x = "Actual Award Share",
       y = "Predicted Award Share") +
  theme_minimal()

ggplotly(p3)
# Horizontal bar chart of the ten players with the highest predicted
# award share across all test seasons.
top_predictions <- test_data %>%
  arrange(desc(predicted_award_share)) %>%
  slice_head(n = 10)

p5 <- ggplot(top_predictions,
             aes(x = reorder(player, predicted_award_share),
                 y = predicted_award_share)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 10 Predicted MVP Candidates", x = "", y = "Predicted Award Share")

ggplotly(p5)
# Print a plain-text summary of the final results: the winning model, its
# test RMSE, the MVP prediction accuracy, and the three most important
# features from the importance table computed above.
cat("Best model:", best_model_name, "\n")
## Best model: KNN
cat("Model RMSE:", round(min(model_rmse), 4), "\n")
## Model RMSE: 0.4938
cat("MVP Prediction Accuracy:", round(mvp_accuracy * 100, 1), "%", "\n")
## MVP Prediction Accuracy: 50 %
cat("Top 3 most important features:", "\n")
## Top 3 most important features:
cat("1.", var_importance$Variable[1], "\n")
## 1. vorp
cat("2.", var_importance$Variable[2], "\n")
## 2. value_composite
cat("3.", var_importance$Variable[3], "\n")
## 3. ws

Conclusion

This analysis demonstrates that our model can predict NBA MVP candidates with reasonable accuracy. The KNN model performed best with an RMSE of 0.4938. We were able to correctly identify the MVP winner in 50% of the test seasons. The most important features for predicting MVP status are vorp, value_composite, and ws, indicating that these statistics have the strongest influence on MVP voting patterns.

Next Steps

1. Collect more recent data to validate the model on the latest seasons. 2. Explore additional features such as media attention metrics. 3. Develop an ensemble approach combining multiple top-performing models. 4. Create an interactive prediction tool for upcoming seasons.